Experiment: Translate the dataset 10 units at a time and see how the logFC values change
Used only SFARI genes to make the calculations faster
The adjusted p-value is not affected by the gene expression of the dataset but the logFC is. The bigger the translation, the smaller the number of genes with a significant logFC
get_logFC = function(datExpr, trans){
datExpr_SFARI = datExpr %>% filter(rownames(datExpr) %in% SFARI_genes$ID)
rownames(datExpr_SFARI) = rownames(datExpr)[rownames(datExpr) %in% SFARI_genes$ID]
datExpr_translated = datExpr_SFARI + 10*trans
datProbes_SFARI = datProbes %>% filter(rownames(datExpr) %in% SFARI_genes$ID)
counts = as.matrix(datExpr_translated)
rowRanges = GRanges(datProbes_SFARI$chromosome_name,
IRanges(datProbes_SFARI$start_position, width=datProbes_SFARI$length),
strand=datProbes_SFARI$strand,
feature_id=datProbes_SFARI$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
ddsSE = DESeqDataSet(se, design =~Diagnosis_)
dds = DESeq(ddsSE)
DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
mutate('logFC'=log2FoldChange, 'translation'=10*trans) %>%
dplyr::select(ID, logFC, padj, translation)
return(DE_info)
}
logFC_results = get_logFC(datExpr, 0)
for(trans in seq(1,10)){
new = get_logFC(datExpr, trans)
logFC_results = bind_rows(logFC_results, new)
}
p1 = ggplotly(ggplot(logFC_results, aes(translation, abs(logFC), group=translation, fill=translation)) +
geom_boxplot() + theme_minimal())
p2 = ggplotly(ggplot(logFC_results, aes(translation, padj, group=translation, fill=translation)) +
geom_boxplot() + theme_minimal() +
ggtitle('LogFC (left) and adj p-val (right) for different translations of the data'))
subplot(p1, p2, nrows=1)
significant = logFC_results %>% filter(abs(logFC)>log2(1.2) & padj<0.05) %>% group_by(translation) %>% tally
ggplotly(ggplot(significant, aes(translation, n, fill=translation)) + geom_bar(stat='identity') +
theme_minimal() + ggtitle('Number of DE genes for each translation'))
rm(new, p1, p2, logFC_results, significant)
The gene expression grouped by SFARI could be approximated by a logNormal distribution … kind of…
all_points = datExpr %>% mutate('ID' = rownames(datExpr)) %>% left_join(SFARI_genes, by='ID') %>%
filter(!is.na(`gene-score`)) %>% mutate(`gene-score` = as.factor(`gene-score` )) %>%
dplyr::select(c(colnames(datExpr), `gene-score`)) %>% melt
## Using gene-score as id variables
ggplotly(ggplot(all_points, aes(value, group=`gene-score`, fill=`gene-score`, color=`gene-score`)) +
geom_density(alpha=0.3, color=NA) + scale_x_continuous(trans='log2') + theme_minimal() +
ggtitle('Gene expression grouped by SFARI score'))